This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

If you are viewing it in a browser (HTML file), the original code can be found and executed in the associated .Rmd file.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Supporting Libraries

# Used for calculating AUC and more
# library(mltools)
# library(data.table)

# SOM support libraries
library(kohonen)
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(ggplot2)
library(sp)
library(maptools)
## Checking rgeos availability: TRUE
## Please note that 'maptools' will be retired by the end of 2023,
## plan transition at your earliest convenience;
## some functionality will be moved to 'sp'.
library(reshape2)
library(rgeos)
## rgeos version: 0.5-9, (SVN revision 684)
##  GEOS runtime version: 3.9.1-CAPI-1.14.2 
##  Please note that rgeos will be retired by the end of 2023,
## plan transition to sf functions using GEOS at your earliest convenience.
##  GEOS using OverlayNG
##  Linking to sp version: 1.4-6 
##  Polygon checking: TRUE

Setting support functions and working directory

# Fix the RNG seed so the sampling and train/test splits below are reproducible.
set.seed(10)
# ONLY IF RUNNING NATIVELY, ELSE EXECUTE NEXT BLOCK IN R NOTEBOOK
# setwd("~/OneDrive/University/INFO411_DataMining/Project/RProject/Project4/DataSet")

### Helping Functions
binningSpam <- function(value){
  # Bin a numeric spamicity score into a single-level factor label:
  # scores strictly above 0.5 become "Spam", everything else "NonSpam".
  label <- if (value > 0.5) "Spam" else "NonSpam"
  factor(label)
}

binningSpamNum <- function(value){
  # Numeric counterpart of binningSpam(): 1 for Spam (score > 0.5), 0 otherwise.
  # BUG FIX: both branches previously returned 1, so every host was labelled
  # spam regardless of its score.
  if (value > 0.5){
    return(1)
  }
  else{
    return(0)
  }
}

Importing Data and Visualising

We’ll import in the following data sets: - trainRAW - The labels we’ll use for training that shows which host IDs are “Spam”, “not Spam”, or “Undecided” - testRAW - The labels we’ll use for the testing that shows which host IDs are “Spam”, “not Spam”, or “Undecided” - linkRAW - Domains and their associated linked attributes. - The data here is not normalized, not transformed. - linkTransfromedRAW - Domains and their associated linked attributes. - The data here is normalized via Log(10), and has multiple transformations performed and added as new columns - For example, there are columns where the PageRank of the Homepage is divided by the next linked PageRank

# Host labels for training; column V1 is the host id (used in merges below),
# V3 the label/score that later coerces to numeric.
trainRAW <- data.frame(read.csv("webspam-uk2007-set1-1-10_TRAINING/WEBSPAM-UK2007-SET1-labels.txt", header= F, sep=" ", dec="."))
# Host labels for the held-out (SET2) test data.
testRAW <- data.frame(read.csv("webspam-uk2007-set2-1-10_TEST/WEBSPAM-UK2007-SET2-labels.txt", header= F, sep=" ", dec="."))
# Raw (untransformed) link-based features per host; key column is X.hostid.
linkRAW <- data.frame(read.csv("uk-2007-05.link_based_features.csv", header=T))
# Log-scaled / ratio-transformed link-based features per host.
linkTransfromedRAW <- data.frame(read.csv("uk-2007-05.link_based_features_transformed.csv", header=T))

Analysis of the linked dataset

# Original Link Train DS
# Join training labels onto the raw link features by host id (V1 <-> X.hostid).
linkTrain <- merge(trainRAW,linkRAW,by.x = "V1", by.y="X.hostid")
# Drop id/extra label columns, keeping V3 (label/score) plus the features.
linkTrain <- linkTrain[,c(-1,-2,-4,-5)]
# Coerce the label to numeric; non-numeric entries become NA ...
linkTrain$V3 <- as.numeric(as.character(linkTrain$V3))
## Warning: NAs introduced by coercion
# ... and those rows are removed here.
linkTrain <- na.omit(linkTrain)

# Correlation for the original Train DS. Correlation is not strong at all.
# Absolute correlation of every column against the label V3,
# sorted strongest-first; drop=F keeps the 1-column matrix and its row names.
corTable <- abs(cor(linkTrain,y=linkTrain$V3))
corTable = corTable[order(corTable, decreasing = T),,drop=F]

head(corTable,20)
##                              [,1]
## V3                     1.00000000
## truncatedpagerank_4_hp 0.06409890
## truncatedpagerank_1_hp 0.06373734
## truncatedpagerank_3_hp 0.06356978
## truncatedpagerank_2_hp 0.06344003
## pagerank_hp            0.06308755
## pagerank_mp            0.05911198
## truncatedpagerank_4_mp 0.05338514
## truncatedpagerank_3_mp 0.05316211
## truncatedpagerank_2_mp 0.05280514
## truncatedpagerank_1_mp 0.05162838
## siteneighbors_1_hp     0.05042254
## prsigma_hp             0.04911449
## siteneighbors_4_hp     0.04650915
## eq_hp_mp               0.04435796
## assortativity_mp       0.04260146
## siteneighbors_3_hp     0.03862848
## siteneighbors_4_mp     0.03682804
## prsigma_mp             0.03482937
## siteneighbors_3_mp     0.03445648

From a quick glance, we can see that none of the data has any strong correlation with the result (at least without transformation).

Let’s take a look at the other dataset.

Analysis of the linked dataset (transformed)

# Loading the Train DS that has been logged
# Same join/cleanup as for linkTrain, but against the transformed feature set.
linkTransformedTrain <- merge(trainRAW,linkTransfromedRAW,by.x = "V1", by.y="X.hostid")
linkTransformedTrain <- linkTransformedTrain[,c(-1,-2,-4)]
# Non-numeric labels coerce to NA and are dropped below.
linkTransformedTrain$V3 <- as.numeric(as.character(linkTransformedTrain$V3))
## Warning: NAs introduced by coercion
linkTransformedTrain <- na.omit(linkTransformedTrain)

# Correlation table. 
# Absolute correlation of every transformed attribute against V3,
# sorted strongest-first (drop=F preserves the matrix shape and row names).
corTable2 <- abs(cor(linkTransformedTrain,y=linkTransformedTrain$V3))
corTable2 = corTable2[order(corTable2, decreasing = T),,drop=F]

head(corTable2,21)
##                                                                                                                                                                                  [,1]
## V3                                                                                                                                                                          1.0000000
## log_OP_truncatedpagerank_1_mp_div_pagerank_mp_CP_                                                                                                                           0.1515353
## log_OP_outdegree_mp_div_pagerank_mp_CP_                                                                                                                                     0.1505800
## log_OP_truncatedpagerank_2_mp_div_pagerank_mp_CP_                                                                                                                           0.1488675
## L_outdegree_mp                                                                                                                                                              0.1481146
## L_avgin_of_out_mp                                                                                                                                                           0.1446143
## log_OP_truncatedpagerank_3_mp_div_pagerank_mp_CP_                                                                                                                           0.1435711
## log_OP_outdegree_hp_div_pagerank_hp_CP_                                                                                                                                     0.1432103
## log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_                                                                                                                                 0.1422249
## log_OP_truncatedpagerank_4_mp_div_pagerank_mp_CP_                                                                                                                           0.1409383
## L_outdegree_hp                                                                                                                                                              0.1404630
## L_avgin_of_out_hp                                                                                                                                                           0.1395381
## log_OP_avgin_of_out_hp_mul_outdegree_hp_CP_                                                                                                                                 0.1359523
## log_OP_min_OP_truncatedpagerank_2_hp_div_truncatedpagerank_1_hp_truncatedpagerank_3_hp_div_truncatedpagerank_2_hp_truncatedpagerank_4_hp_div_truncatedpagerank_3_hp_CP__CP_ 0.1189911
## L_prsigma_hp                                                                                                                                                                0.1188065
## log_OP_truncatedpagerank_4_hp_div_pagerank_hp_CP_                                                                                                                           0.1178506
## log_OP_truncatedpagerank_3_hp_div_pagerank_hp_CP_                                                                                                                           0.1177164
## log_OP_truncatedpagerank_2_hp_div_pagerank_hp_CP_                                                                                                                           0.1172431
## log_OP_prsigma_hp_div_pagerank_hp_CP_                                                                                                                                       0.1168947
## log_OP_truncatedpagerank_1_hp_div_pagerank_hp_CP_                                                                                                                           0.1123874
## L_prsigma_mp                                                                                                                                                                0.1097668
# Keep the 21 strongest correlation rows; their row names are attribute names.
headhead <- head(corTable2,21)
headnames <- row.names(headhead)
# Top 10 attribute names (index 1 is V3 itself, so it is skipped).
headnames <- headnames[2:11]
# Top 20 attribute names, again excluding V3.
headnames20 <- row.names(headhead)[2:21]

# Class balance check: non-spam vs spam counts in the cleaned training data.
sum(linkTransformedTrain$V3 <0.5)
## [1] 3776
sum(linkTransformedTrain$V3 > 0.5)
## [1] 222

Looking at the correlation of the corTable, we find a lot more attributes that have stronger correlation.

We also see that the distribution of Spam to Non-Spam is very skewed, thus we’ll try to balance the dataset before visualizing the top 10 correlated.

We’ll utilize an oversampling technique to try to ensure we have enough samples for training. (2x the amount of “Spam” we have)

### OVERSAMPLING and splitting
# Undecided (0.5) are dropped
# I tried undersampling... no distinct change

# Split by class; rows with a score of exactly 0.5 fall into neither subset.
linkTransformedTrainSpam <- subset(linkTransformedTrain, V3 > 0.5)
linkTransformedTrainNotSpam <- subset(linkTransformedTrain, V3 < 0.5)
# Oversample spam WITH replacement up to 444 rows (2x the 222 available);
# undersample non-spam WITHOUT replacement down to the same 444.
linkTransformedTrainSpam <- linkTransformedTrainSpam[sample(1:nrow(linkTransformedTrainSpam), size=444, replace=T),]
linkTransformedTrainNotSpam <- linkTransformedTrainNotSpam[sample(1:nrow(linkTransformedTrainNotSpam), size=444, replace=F),]

# 888-row balanced training frame (444 spam + 444 non-spam).
linkTransformedBalanced = rbind(linkTransformedTrainSpam, linkTransformedTrainNotSpam)

# Factor label column derived from the numeric score.
linkTransformedBalanced$binnedY <- sapply(linkTransformedBalanced$V3, binningSpam)
#linkTransformedTest$binnedY <- sapply(linkTransformedTest$V3, binningSpam)

Plot visualization

# Scatter each of the 10 (+2) most correlated attributes against the binned
# label. Kept as literal calls (not a loop) so the default axis labels show
# the full attribute expression for each chart.
plot(linkTransformedBalanced$log_OP_truncatedpagerank_1_mp_div_pagerank_mp_CP_,linkTransformedBalanced$binnedY)

plot(linkTransformedBalanced$log_OP_truncatedpagerank_2_mp_div_pagerank_mp_CP_,linkTransformedBalanced$binnedY)

plot(linkTransformedBalanced$log_OP_outdegree_mp_div_pagerank_mp_CP_,linkTransformedBalanced$binnedY)

plot(linkTransformedBalanced$L_outdegree_mp,linkTransformedBalanced$binnedY)

plot(linkTransformedBalanced$log_OP_truncatedpagerank_3_mp_div_pagerank_mp_CP_,linkTransformedBalanced$binnedY)

plot(linkTransformedBalanced$L_avgin_of_out_mp,linkTransformedBalanced$binnedY)

plot(linkTransformedBalanced$log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_,linkTransformedBalanced$binnedY)

plot(linkTransformedBalanced$log_OP_truncatedpagerank_4_mp_div_pagerank_mp_CP_,linkTransformedBalanced$binnedY)

plot(linkTransformedBalanced$L_avgin_of_out_hp,linkTransformedBalanced$binnedY)

plot(linkTransformedBalanced$log_OP_outdegree_hp_div_pagerank_hp_CP_,linkTransformedBalanced$binnedY)

plot(linkTransformedBalanced$L_outdegree_hp,linkTransformedBalanced$binnedY)

plot(linkTransformedBalanced$log_OP_avgin_of_out_hp_mul_outdegree_hp_CP_,linkTransformedBalanced$binnedY)

We can see on some charts, such as L_outdegree_mp, that there are a few instances in both the spam and non-spam classes where the value sits at around -50. Let's take a look at what those instances are and how many of them are spam and non-spam.

# Of the rows with L_outdegree_mp below -40 (the cluster near -50 in the
# plots), how many are spam vs non-spam?
sum(linkTransformedBalanced[linkTransformedBalanced$L_outdegree_mp < -40,]$V3 > 0.5)
## [1] 198
sum(linkTransformedBalanced[linkTransformedBalanced$L_outdegree_mp < -40,]$V3 < 0.5)
## [1] 75

There seems to be a skew of 1 value compared to the rest, as such, we’ll leave it in as it could be helpful for our algorithms later.

SOM Visualization

Let’s visualise it with an SOM to see how well defined clusters are.

#Colour palette definition
pretty_palette <- c("#1f77b4", '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2')

# ------------------- SOM TRAINING ---------------------------
{
# Train the SOM on the 20 attributes most correlated with the label
# (the row names of the correlation table, excluding V3 itself).
# BUG FIX: this previously indexed with `headhead` — the correlation matrix
# itself — so its numeric values, not its row names, were used as column
# indices (fractional indices truncate toward 0 and are dropped), silently
# selecting the wrong columns.
data_train <- linkTransformedBalanced[, headnames20]

# Standardise every attribute (zero mean, unit variance) before training.
data_train_matrix <- as.matrix(scale(data_train))
names(data_train_matrix) <- names(data_train)
require(kohonen)
x_dim=15
y_dim=15
som_grid <- somgrid(xdim = x_dim, ydim=y_dim, topo="hexagonal")
# Train the SOM model! The som() signature differs between kohonen 2.x
# (n.hood argument) and 3.x (mode / normalizeDataLayers arguments).
if (packageVersion("kohonen") < "3"){
  system.time(som_model <- som(data_train_matrix, 
                               grid=som_grid, 
                               rlen=1000, 
                               alpha=c(0.9,0.01),
                               n.hood = "circular",
                               keep.data = TRUE ))
}else{
  system.time(som_model <- som(data_train_matrix, 
                               grid=som_grid, 
                               rlen=1000, 
                               alpha=c(0.9,0.01),
                               mode="online",
                               # BUG FIX: was lowercase `false`, which is an
                               # undefined object in R, not the logical FALSE.
                               normalizeDataLayers=FALSE,
                               keep.data = TRUE ))
}

# Diagnostic plots: training progress, node occupancy, quantisation quality,
# U-matrix neighbour distances, and the codebook vectors.
plot(som_model, type = "changes")
#counts within nodes
plot(som_model, type = "counts", main="Node Counts")
#map quality
plot(som_model, type = "quality", main="Node Quality/Distance")
#neighbour distances
plot(som_model, type="dist.neighbours", main = "SOM neighbour distances", palette.name=grey.colors)
#code spread
plot(som_model, type = "codes")

plotHeatMap <- function(som_model, data, variable=0){    
  # Plot a heatmap for any variable from the data set "data".
  # If variable is 0, an interactive window will be provided to choose the variable.
  # If not, the variable in "variable" will be plotted.
  #
  # som_model : trained kohonen SOM; uses $unit.classif to map data rows to nodes,
  #             so `data` row order must match the SOM's training data.
  # data      : data frame to colour the map by.
  # variable  : 0 for interactive column selection, otherwise a column index.
  
  require(dummies)
  require(kohonen)
  
  interactive <- TRUE
  
  # Loop so the interactive picker can plot repeatedly; when a fixed
  # `variable` is given, interactive is set FALSE and the loop runs once.
  while (interactive == TRUE){
    
    if (variable == 0){
      #show interactive window.
      color_by_var <- select.list(names(data), multiple=FALSE,
                                  graphics=TRUE, 
                                  title="Choose variable to color map by.")
      # check for user finished.
      if (color_by_var == ""){ # if user presses Cancel - we quit function        
        return(TRUE)
      }
      interactive <- TRUE
      color_variable <- data.frame(data[, color_by_var])
      
    } else {
      color_variable <- data.frame(data[, variable])
      color_by_var <- names(data)[variable]
      interactive <- FALSE
    }
    
    #if the variable chosen is a string or factor - 
    #Get the levels and ask the user to choose which one they'd like.
    
    if (class(color_variable[,1]) %in% c("character", "factor", "logical")){
      #want to spread this out into dummy factors - but colour by one of those.
      temp_data <- dummy.data.frame(color_variable, sep="_")
      # print(temp_data)
      # chosen_factor <- select.list(names(temp_data),
      #                              multiple=FALSE,
      #                              graphics=TRUE,
      #                              title="Choose level of variable for colouring")
      # print(chosen_factor)
      # NOTE(review): hard-coded to the dummy column dummy.data.frame()
      # generates for a "Spam" level; this only works for variables that
      # actually have a "Spam" level — confirm before reusing elsewhere.
      chosen_factor <- "data...variable._Spam"
      color_variable <- temp_data[, chosen_factor]
      rm(temp_data, chosen_factor)
      color_by <- color_variable
    } else {      
      #impute the missing values with the mean.
      color_variable[is.na(color_variable[,1]),1] <- mean(color_variable[,1], na.rm=TRUE)
      #color_by <- capVector(color_variable[,1])
      #color_by <- scale(color_by)  
      color_by <- color_variable[,1]
    }
    # Mean colour value of the rows mapped to each SOM node, then paint the
    # map with those per-node means.
    unit_colors <- aggregate(color_by, by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)
    plot(som_model, type = "property", property=unit_colors[,2], main=color_by_var)    
  }
}

# Colour the trained map by column 140 of the balanced frame
# (non-interactive: a fixed column index is supplied).
plotHeatMap(som_model, linkTransformedBalanced, variable=140)

}

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in bgcolors[!is.na(showcolors)] <- bgcol[showcolors[!
## is.na(showcolors)]]: number of items to replace is not a multiple of replacement
## length

SOM Visualization Analysis

While there are clusters that are formed from our data, the clusters do not have distinct Spam or non-spam results, thus will likely result in our classification model not working well. We’ll proceed to try to classify, but before then we’ll split our data and prepare our testing data

# Loading some libraries here because if I load them before SOM it bugs out
# MLP, ROC
library(RSNNS)
## Loading required package: Rcpp
## 
## Attaching package: 'RSNNS'
## The following object is masked from 'package:kohonen':
## 
##     som
# DT
library(rpart)
# 
# Forest
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# 
# Naive Bayes
library(e1071)
# Restrict the balanced frame to the top-10 attributes plus the factor label.
linkTransformedBalancedSelected <- linkTransformedBalanced[,c(headnames,"binnedY")]


# 70/30 train/test split of the balanced data.
# BUG FIX: rep(0, 0.7 * n) silently truncates a fractional count, so whenever
# 0.7 * n is not an integer (e.g. n = 888 gives 621.6) the split vector came
# up one element short and R recycled the logical index when subsetting,
# mis-assigning the final row. Compute integer counts explicitly so the
# split vector always matches the row count exactly.
n_rows <- nrow(linkTransformedBalancedSelected)
n_train <- round(0.7 * n_rows)
split1 <- sample(c(rep(0, n_train), rep(1, n_rows - n_train)))
trainDS <- linkTransformedBalancedSelected[split1 == 0, ]
testDS <- linkTransformedBalancedSelected[split1 == 1, ]

# FOR LATER: the same split applied to the full-width balanced frame.
linkTransformedBalancedTrain <- linkTransformedBalanced[split1 == 0, ]
linkTransformedBalancedTest <- linkTransformedBalanced[split1 == 1, ]

# Build the real held-out test set from the SET2 labels: join, drop id/extra
# columns, keep only rows whose label parses as numeric, then bin to factors.
linkTransformedTest <-merge(testRAW,linkTransfromedRAW,by.x = "V1", by.y="X.hostid")
linkTransformedTest <- linkTransformedTest[,c(-1,-2,-4)]
linkTransformedTest$V3 <- as.numeric(as.character(linkTransformedTest$V3))
## Warning: NAs introduced by coercion
linkTransformedTest <- na.omit(linkTransformedTest)
linkTransformedTest$binnedY <- sapply(linkTransformedTest$V3, binningSpam)

Training Models on 10 most correlated attributes.

MLP Training

library(RSNNS)

# (Re)derive the factor label on the balanced frame.
linkTransformedBalanced$binnedY <- sapply(linkTransformedBalanced$V3, binningSpam)

# Inputs: the 10 most correlated attributes; targets: one-hot encoded labels.
trainValues <- linkTransformedBalanced[, headnames]
trainTargets <- decodeClassLabels(linkTransformedBalanced[,"binnedY"])

# 80/20 split, then normalise inputs/targets with RSNNS helpers.
trainset <- splitForTrainingAndTest(trainValues, trainTargets, ratio=0.2)
trainset <- normTrainingAndTestSet(trainset)

# Single hidden layer of 5 units, learning rate 0.01, up to 2000 iterations.
model <- mlp(trainset$inputsTrain, trainset$targetsTrain, size=5, learnFuncParams=c(0.01), maxit=2000, inputsTest=trainset$inputsTest, targetsTest=trainset$targetsTest)
predictTestSet <- predict(model,trainset$inputsTest)

# Confusion matrices on the training and test portions.
confusionMatrix(trainset$targetsTrain,fitted.values(model))
##        predictions
## targets   1   2
##       1 374  70
##       2 115 151
confusionMatrix(trainset$targetsTest,predictTestSet)
##        predictions
## targets  1  2
##       2 81 97
# 2x2 plot grid: iterative error, regression error, train ROC, test ROC.
par(mar=c(5.1,4.1,4.1,2.1))
par(mfrow=c(2,2))
plotIterativeError(model)
plotRegressionError(predictTestSet[,2], trainset$targetsTest[,2])
plotROC(fitted.values(model)[,2], trainset$targetsTrain[,2])
plotROC(predictTestSet[,2], trainset$targetsTest[,2])

As we can see from the results, it doesn’t seem to work very well at all as it just classifies everything as spam. We tried running it with different tuning parameters (iterations, learning rate, activation functions), but none of it seemed to fix the issue.

We also didn’t bother calculating the AUC for this graph as there isn’t any (0).

We’ll try it with other algorithms and also revisit it after transforming the data more.

Decision Tree

library(rpart)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
library(tree)
library(randomForest)
library(mltools)
## 
## Attaching package: 'mltools'
## The following object is masked from 'package:e1071':
## 
##     skewness
# More tuning parameters
# https://dzone.com/articles/decision-trees-and-pruning-in-r

# Fit a CART decision tree using Gini impurity as the split criterion,
# then print the fitted tree structure.
Gini.DT.rpart <- rpart(binnedY ~ ., data = trainDS, parms=list(split = "gini"))
print(Gini.DT.rpart)
## n= 622 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 622 299 NonSpam (0.48070740 0.51929260)  
##     2) log_OP_outdegree_mp_div_pagerank_mp_CP_< 14.31755 202  54 Spam (0.73267327 0.26732673) *
##     3) log_OP_outdegree_mp_div_pagerank_mp_CP_>=14.31755 420 151 NonSpam (0.35952381 0.64047619)  
##       6) log_OP_truncatedpagerank_2_mp_div_pagerank_mp_CP_< -0.03106625 120  48 Spam (0.60000000 0.40000000)  
##        12) log_OP_outdegree_hp_div_pagerank_hp_CP_>=21.02365 31   2 Spam (0.93548387 0.06451613) *
##        13) log_OP_outdegree_hp_div_pagerank_hp_CP_< 21.02365 89  43 NonSpam (0.48314607 0.51685393)  
##          26) L_avgin_of_out_mp>=4.503144 33  10 Spam (0.69696970 0.30303030) *
##          27) L_avgin_of_out_mp< 4.503144 56  20 NonSpam (0.35714286 0.64285714)  
##            54) log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_< 3.040749 35  16 Spam (0.54285714 0.45714286)  
##             108) log_OP_truncatedpagerank_3_mp_div_pagerank_mp_CP_< -0.4596983 23   6 Spam (0.73913043 0.26086957) *
##             109) log_OP_truncatedpagerank_3_mp_div_pagerank_mp_CP_>=-0.4596983 12   2 NonSpam (0.16666667 0.83333333) *
##            55) log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_>=3.040749 21   1 NonSpam (0.04761905 0.95238095) *
##       7) log_OP_truncatedpagerank_2_mp_div_pagerank_mp_CP_>=-0.03106625 300  79 NonSpam (0.26333333 0.73666667)  
##        14) L_outdegree_mp>=3.737386 36  12 Spam (0.66666667 0.33333333)  
##          28) L_outdegree_mp< 4.2194 24   4 Spam (0.83333333 0.16666667) *
##          29) L_outdegree_mp>=4.2194 12   4 NonSpam (0.33333333 0.66666667) *
##        15) L_outdegree_mp< 3.737386 264  55 NonSpam (0.20833333 0.79166667)  
##          30) log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_< 3.55494 64  24 NonSpam (0.37500000 0.62500000)  
##            60) log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_>=2.517176 37  17 Spam (0.54054054 0.45945946)  
##             120) log_OP_outdegree_mp_div_pagerank_mp_CP_< 18.89231 19   5 Spam (0.73684211 0.26315789) *
##             121) log_OP_outdegree_mp_div_pagerank_mp_CP_>=18.89231 18   6 NonSpam (0.33333333 0.66666667) *
##            61) log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_< 2.517176 27   4 NonSpam (0.14814815 0.85185185) *
##          31) log_OP_avgin_of_out_mp_mul_outdegree_mp_CP_>=3.55494 200  31 NonSpam (0.15500000 0.84500000)  
##            62) log_OP_truncatedpagerank_1_mp_div_pagerank_mp_CP_>=0.1475373 101  25 NonSpam (0.24752475 0.75247525)  
##             124) log_OP_outdegree_mp_div_pagerank_mp_CP_>=20.08747 8   1 Spam (0.87500000 0.12500000) *
##             125) log_OP_outdegree_mp_div_pagerank_mp_CP_< 20.08747 93  18 NonSpam (0.19354839 0.80645161) *
##            63) log_OP_truncatedpagerank_1_mp_div_pagerank_mp_CP_< 0.1475373 99   6 NonSpam (0.06060606 0.93939394) *
# Draw the Gini tree with its node labels.
plot(Gini.DT.rpart)
text(Gini.DT.rpart)

# Fit a second tree using information gain as the split criterion, for comparison.
IG.DT.rpart <- rpart(binnedY ~ ., data = trainDS, parms=list(split = "information"))
print(IG.DT.rpart)
## n= 622 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 622 299 NonSpam (0.4807074 0.5192926)  
##     2) log_OP_outdegree_mp_div_pagerank_mp_CP_< 14.31755 202  54 Spam (0.7326733 0.2673267) *
##     3) log_OP_outdegree_mp_div_pagerank_mp_CP_>=14.31755 420 151 NonSpam (0.3595238 0.6404762)  
##       6) log_OP_truncatedpagerank_2_mp_div_pagerank_mp_CP_< -0.03106625 120  48 Spam (0.6000000 0.4000000)  
##        12) log_OP_outdegree_mp_div_pagerank_mp_CP_>=21.32083 21   0 Spam (1.0000000 0.0000000) *
##        13) log_OP_outdegree_mp_div_pagerank_mp_CP_< 21.32083 99  48 Spam (0.5151515 0.4848485)  
##          26) L_avgin_of_out_mp>=4.503144 37  10 Spam (0.7297297 0.2702703) *
##          27) L_avgin_of_out_mp< 4.503144 62  24 NonSpam (0.3870968 0.6129032)  
##            54) L_avgin_of_out_mp< 0.9835562 32  13 Spam (0.5937500 0.4062500)  
##             108) log_OP_truncatedpagerank_2_mp_div_pagerank_mp_CP_< -0.3348512 22   5 Spam (0.7727273 0.2272727) *
##             109) log_OP_truncatedpagerank_2_mp_div_pagerank_mp_CP_>=-0.3348512 10   2 NonSpam (0.2000000 0.8000000) *
##            55) L_avgin_of_out_mp>=0.9835562 30   5 NonSpam (0.1666667 0.8333333) *
##       7) log_OP_truncatedpagerank_2_mp_div_pagerank_mp_CP_>=-0.03106625 300  79 NonSpam (0.2633333 0.7366667)  
##        14) L_outdegree_mp>=3.737386 36  12 Spam (0.6666667 0.3333333)  
##          28) L_outdegree_mp< 4.2194 24   4 Spam (0.8333333 0.1666667) *
##          29) L_outdegree_mp>=4.2194 12   4 NonSpam (0.3333333 0.6666667) *
##        15) L_outdegree_mp< 3.737386 264  55 NonSpam (0.2083333 0.7916667) *
# Draw the information-gain tree with its node labels.
plot(IG.DT.rpart)
text(IG.DT.rpart)

# Class predictions from both trees on the held-out 30% split.
GiniDTPredict <- predict(Gini.DT.rpart, testDS, type="class")
InfomationGainDTPredict <- predict(IG.DT.rpart, testDS, type="class")

# Confusion tables (rows = actual label, columns = predicted).
table(testDS$binnedY, GiniDTPredict)
##          GiniDTPredict
##           Spam NonSpam
##   Spam     113      32
##   NonSpam   35      86
table(testDS$binnedY, InfomationGainDTPredict)
##          InfomationGainDTPredict
##           Spam NonSpam
##   Spam     106      39
##   NonSpam   35      86
# ROC / AUC from hard 0-1 predictions.
# NOTE(review): the MLP section calls plotROC(predictions, targets), while
# here the actual labels are passed first — verify the argument order is
# intended before trusting these curves.
plotROC(as.integer(testDS$binnedY == "Spam"), as.integer(GiniDTPredict == "Spam"))

auc_roc(as.integer(testDS$binnedY == "Spam"), as.integer(GiniDTPredict == "Spam"))
## [1] 0.7461635
plotROC(as.integer(testDS$binnedY == "Spam"), as.integer(InfomationGainDTPredict == "Spam"))

auc_roc(as.integer(testDS$binnedY == "Spam"), as.integer(InfomationGainDTPredict == "Spam"))
## [1] 0.7198865

The results look promising with an area under curve of 72.0%.

Testing with DT on real data set

# Apply both fitted trees to the real (SET2) held-out test data.
GiniDTPredict <- predict(Gini.DT.rpart, linkTransformedTest, type="class")
InfomationGainDTPredict <- predict(IG.DT.rpart, linkTransformedTest, type="class")

# Confusion tables (rows = actual label, columns = predicted).
table(linkTransformedTest$binnedY, GiniDTPredict)
##          GiniDTPredict
##           Spam NonSpam
##   NonSpam  694    1292
##   Spam      78      44
table(linkTransformedTest$binnedY, InfomationGainDTPredict)
##          InfomationGainDTPredict
##           Spam NonSpam
##   NonSpam  622    1364
##   Spam      74      48
# ROC / AUC on the real test data (same argument-order caveat as above).
plotROC(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(GiniDTPredict == "Spam"))

auc_roc(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(GiniDTPredict == "Spam"))
## [1] 0.5340511
plotROC(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(InfomationGainDTPredict == "Spam"))

auc_roc(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(InfomationGainDTPredict == "Spam"))
## [1] 0.5361638

Unfortunately, our AUC drops to just 53.6%.

Random Forest

# Baseline random forest: 10 trees, minimum terminal node size of 1.
forest <- randomForest(binnedY ~ ., data = trainDS, nodesize=1, ntree=10)
# OOB error vs number of trees.
plot(forest)

forestPredict <- predict(forest, testDS, type="class")
table(testDS$binnedY, forestPredict)
##          forestPredict
##           Spam NonSpam
##   Spam     127      18
##   NonSpam   19     102
plotROC(as.integer(testDS$binnedY == "Spam"), as.integer(forestPredict == "Spam"))

auc_roc(as.integer(testDS$binnedY == "Spam"), as.integer(forestPredict == "Spam"))
## [1] 0.8599315

We seem to be getting a better result than DT.

Fine tuning RF

# Accumulator for grid-search results (one row per ntree/mtry attempt).
totalAttempts = data.frame(row.names=c("ntree","mtry","accuracy","precision","recall"))

# Commented out so that it doesn't run on Export. Results can be seen below.
# Grid search over ntree (5-100 by 5) and mtry (2-10), recording accuracy,
# precision and recall from the confusion table of each fitted forest.

# for (ntree in seq(5,100, by=5)){
#   for (mtry in c(2:10)){
#     forest <- randomForest(binnedY ~ ., data = trainDS, nodesize=1, ntree=ntree, mtry=mtry)
#     forestPredict <- predict(forest, testDS, type="class")
#     tempTable <- table(testDS$binnedY, forestPredict)
#     TP <- tempTable["Spam","Spam"]
#     FP <- tempTable["NonSpam","Spam"]
#     FN <- tempTable["Spam","NonSpam"]
#     TN <- tempTable["NonSpam","NonSpam"]
#     
#     accuracy <- (TP+TN)/(TP+TN+FP+FN)
#     precision <- (TP/(TP+FP))
#     recall <- (TP/(TP+FN))
#     
#     totalAttempts <- rbind(totalAttempts, 
#                            data.frame(ntree=ntree, 
#                                       mtry=mtry, 
#                                       accuracy=accuracy,
#                                       precision = precision,
#                                       recall=recall))
#   }
# }

We initially ran a ntree loop from 10-100, and 100-1000 and found that the smaller trees gave better results. We also found that a ntree size of around half the attributes gave the best results. Thus, based on our testing, we decided on a ntree size of 95 and mtry of 6.

# Tuned forest from the grid search: 95 trees, 6 variables tried per split.
forest <- randomForest(binnedY ~ ., data = trainDS, nodesize=1, ntree=95, mtry=6)

forestPredict <- predict(forest, testDS, type="class")
table(testDS$binnedY, forestPredict)
##          forestPredict
##           Spam NonSpam
##   Spam     121      24
##   NonSpam   15     106
plotROC(as.integer(testDS$binnedY == "Spam"), as.integer(forestPredict == "Spam"))

auc_roc(as.integer(testDS$binnedY == "Spam"), as.integer(forestPredict == "Spam"))
## [1] 0.8525452

We got a slightly worse ROC AUC of around 85.2% vs 85.9%. There is still some randomness involved, and due to our small data set any small shift can cause large changes in the results.

Testing RF on real test DS

# Evaluate the tuned forest on the real (SET2) test data.
forestPredict <- predict(forest, linkTransformedTest, type="class")
tempTable <- table(linkTransformedTest$binnedY, forestPredict)
tempTable
##          forestPredict
##           Spam NonSpam
##   NonSpam  440    1546
##   Spam      51      71
plotROC(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict == "Spam"))

auc_roc(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict == "Spam"))
## [1] 0.5299806

Unfortunately, the results do not transfer to real-world testing, with our AUC dropping to 53%.

Naive Bayes

library(e1071)

# Naive Bayes classifier on the same 10-attribute training split.
naiveBayesModel <- naiveBayes(binnedY ~ ., data=trainDS)

naiveBayesPredict <-  predict(naiveBayesModel, testDS, type="class")
table(testDS$binnedY, naiveBayesPredict)
##          naiveBayesPredict
##           Spam NonSpam
##   Spam      68      77
##   NonSpam   25      96
plotROC(as.integer(testDS$binnedY == "Spam"), as.integer(naiveBayesPredict == "Spam"))

auc_roc(as.integer(testDS$binnedY == "Spam"), as.integer(naiveBayesPredict == "Spam"))
## [1] 0.643048

Naive Bayes gives us an AUC of 64% on our test data… Not the best, but not surprising, as the attributes are likely not independent of each other (and Naive Bayes works best when attributes are independent).

Let’s see how well it performs with the real data set

Testing NB on real test DS

# Naive Bayes evaluated on the real (SET2) test data.
naiveBayesPredict2 <-  predict(naiveBayesModel, linkTransformedTest, type="class")
tempTable <- table(linkTransformedTest$binnedY, naiveBayesPredict2)
tempTable
##          naiveBayesPredict2
##           Spam NonSpam
##   NonSpam  386    1600
##   Spam      53      69
plotROC(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(naiveBayesPredict2 == "Spam"))

auc_roc(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(naiveBayesPredict2 == "Spam"))
## [1] 0.5396934

We get an AUC of 54%.

Training RF on more different attributes

In attempt to improve our RF Model, we will try a different set of attributes:

The following will be attempted - Using Top 20 Correlated - Whole Data set (with parameter adjustment through rfcv) - Top 20 most important (based on what we got from the Whole Data Set) - Once hyper parameters are fine tuned, training on the whole train DS and using that on our test DS

Top 20 Correlated

# Column set for the top-20 experiment: 20 most correlated attribute names plus label.
headnames2 <- append(headnames20,"binnedY")

trainDS.top20 = linkTransformedBalancedTrain[,headnames2]
testDS.top20 = linkTransformedBalancedTest[,headnames2]

## BRUTE FORCE

# {
#   totalAttempts = data.frame(row.names=c("ntree","mtry","accuracy","precision","recall"))
#   
#   for (ntree in seq(200,2000, by=100)){
#     for (mtry in c(2:20)){
#       forest <- randomForest(binnedY ~ ., data = trainDS.top20, nodesize=1, ntree=ntree, mtry=mtry)
#       forestPredict <- predict(forest, testDS.top20, type="class")
#       tempTable <- table(testDS.top20$binnedY, forestPredict)
#     TP <- tempTable["Spam","Spam"]
#     FP <- tempTable["NonSpam","Spam"]
#     FN <- tempTable["Spam","NonSpam"]
#     TN <- tempTable["NonSpam","NonSpam"]
#       
#       accuracy <- (TP+TN)/(TP+TN+FP+FN)
#       precision <- (TP/(TP+FP))
#       recall <- (TP/(TP+FN))
#       
#       totalAttempts <- rbind(totalAttempts, 
#                              data.frame(ntree=ntree, 
#                                         mtry=mtry, 
#                                         accuracy=accuracy,
#                                         precision = precision,
#                                         recall=recall))
#     }
#   }
# }

# Best result was ntree=75, mtry=7

forest20 <- randomForest(binnedY ~ ., data = trainDS.top20, nodesize=1, ntree=75, mtry=7)
forestPredict20 <- predict(forest20, testDS.top20, type="class")
plot(forest20)

table(testDS.top20$binnedY, forestPredict20)
##          forestPredict20
##           Spam NonSpam
##   Spam     124      21
##   NonSpam   17     104
plotROC(as.integer(testDS.top20$binnedY == "Spam"), as.integer(forestPredict20 == "Spam"))

auc_roc(as.integer(testDS.top20$binnedY == "Spam"), as.integer(forestPredict20 == "Spam"))
## [1] 0.8557163

We get 85.6% — not much better; if anything, the difference is within run-to-run variance.

Results on Final DS

# Evaluate the top-20-attribute forest on the real (unbalanced) test set.
# Note this overwrites forestPredict20 from the balanced-test evaluation.
forestPredict20 <- predict(forest20, linkTransformedTest, type="class")
table(linkTransformedTest$binnedY, forestPredict20)
##          forestPredict20
##           Spam NonSpam
##   NonSpam  452    1534
##   Spam      54      68
plotROC(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict20 == "Spam"))

auc_roc(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict20 == "Spam"))
## [1] 0.5321362

There seems to be a slight improvement, but nothing major over our original. (within 1% margins)

Training on the whole DS

After more research was done on RF, it was noticed that attribute selection may not be as important for RF, as it will typically use the Gini index, information gain, or some other splitting criterion to figure out which variable to use. Thus, with fine tuning, we could in theory just feed the whole DS into a Forest and have it work

Finding parameters

First, we want to know how many attributes we should use. We can use rfcv() to figure out which number works best.

# Bin the raw spamicity score (V3) into the Spam/NonSpam factor, then split
# each frame into predictors (X) and target (Y) as rfcv() expects.
linkTransformedBalanced$binnedY <- sapply(linkTransformedBalanced$V3, binningSpam)
linkTransformedBalancedY <- linkTransformedBalanced$binnedY
linkTransformedBalancedX <- subset(linkTransformedBalanced,select=-c(binnedY,V3))

linkTransformedTrain$binnedY <- sapply(linkTransformedTrain$V3, binningSpam)
linkTransformedTrainY <- linkTransformedTrain$binnedY
linkTransformedTrainX <- subset(linkTransformedTrain,select=-c(binnedY,V3))

# Cross-validated error vs. number of variables, on the balanced data...
result <- rfcv(linkTransformedBalancedX,linkTransformedBalancedY, recursive = T)
with(result, plot(n.var, error.cv, log="x", type="o", lwd=2))

# ...and on the full (unbalanced) training data, overwriting `result`.
result <- rfcv(linkTransformedTrainX,linkTransformedTrainY, recursive = T)
with(result, plot(n.var, error.cv, log="x", type="o", lwd=2))

From our results, we can see that if we were to train on our 70% split, 10-15 would be better. But if we were to test on our full training data, 5 would be better

We can then attempt to brute force again to find how big should our tree go

# Whole-attribute train/test frames (only the raw score V3 is dropped).
# NOTE(review): linkTransformedBalancedNoFactor is identical to trainDS.all
# and does not appear to be used afterwards -- candidate for removal.
linkTransformedBalancedNoFactor <- subset(linkTransformedBalancedTrain, select=-c(V3))
trainDS.all <- subset(linkTransformedBalancedTrain, select=-c(V3))
testDS.all = linkTransformedBalancedTest

# DO NOT RUN AS EXECUTION CAN BE SLOW
# Grid search over ntree (10-200) and mtry (2-20) on the whole-attribute DS.
# {
#   totalAttempts = data.frame(row.names=c("ntree","mtry","accuracy","precision","recall"))
#   
#   for (ntree in seq(10,200, by=10)){
#     for (mtry in c(2:20)){
#       forest <- randomForest(binnedY ~ ., data = trainDS.all, nodesize=1, ntree=ntree, mtry=mtry)
#       forestPredict <- predict(forest, testDS.all, type="class")
#       tempTable <- table(testDS.all$binnedY, forestPredict)
#       TP <- tempTable[1,1]
#       FP <- tempTable[2,1]
#       FN <- tempTable[1,2]
#       TN <- tempTable[2,2]
#       
#       accuracy <- (TP+TN)/(TP+TN+FP+FN)
#       precision <- (TP/(TP+FP))
#       recall <- (TP/(TP+FN))
#       
#       totalAttempts <- rbind(totalAttempts, 
#                              data.frame(ntree=ntree, 
#                                         mtry=mtry, 
#                                         accuracy=accuracy,
#                                         precision = precision,
#                                         recall=recall))
#     }
#   }
# }

The “best” parameters based on our testing above were the following: - node size: 1 - number of trees to use: 120 (makes sense, as even though many places recommend a high number, we have few samples) - number of attributes to try at each split (mtry): 13

# Forest trained on all attributes with the tuned parameters (ntree=120, mtry=13).
forest.All <- randomForest(binnedY ~ ., data = trainDS.all, nodesize=1, ntree=120, mtry=13)
plot(forest.All)

# Evaluate on the balanced 30% split.
forestPredict.All <- predict(forest.All, testDS.all, type="class")
tempTable.All <- table(testDS.all$binnedY, forestPredict.All)
tempTable.All
##          forestPredict.All
##           Spam NonSpam
##   Spam     126      19
##   NonSpam   20     101
plotROC(as.integer(testDS.all$binnedY == "Spam"), as.integer(forestPredict.All == "Spam"))

auc_roc(as.integer(testDS.all$binnedY == "Spam"), as.integer(forestPredict.All == "Spam"))
## [1] 0.8523402

We got 85.2%. Not much better or worse sadly

Testing on the test DS (based on our 70% split)

# Evaluate forest.All on the real (unbalanced) test set.
forestPredict.All.Testing <- predict(forest.All, linkTransformedTest, type="class")
table(linkTransformedTest$binnedY, forestPredict.All.Testing)
##          forestPredict.All.Testing
##           Spam NonSpam
##   NonSpam  326    1660
##   Spam      53      69
plotROC(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict.All.Testing == "Spam"))

auc_roc(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict.All.Testing == "Spam"))
## [1] 0.5499671

We got a higher AUC of 55%, but nothing fantastic.

Testing on the test DS (based on all our training data)

Because we just might have too little data, we are going to attempt to use the parameters we set and train it on all the data we have and see if it nets us a better result

# Retrain with the same tuned parameters, but on ALL balanced training data
# (not just the 70% split), then evaluate on the real test set.
trainDS.all.all <- subset(linkTransformedBalanced, select=-c(V3))

forest.All.All <- randomForest(binnedY ~ ., data = trainDS.all.all, nodesize=1, ntree=120, mtry=13)

plot(forest.All.All)

forestPredict.All.All.Testing <- predict(forest.All.All, linkTransformedTest, type="class")
table(linkTransformedTest$binnedY, forestPredict.All.All.Testing)
##          forestPredict.All.All.Testing
##           Spam NonSpam
##   NonSpam  287    1699
##   Spam      46      76
plotROC(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict.All.All.Testing == "Spam"))

auc_roc(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict.All.All.Testing == "Spam"))
## [1] 0.5476606

There is a tad amount of improvement, but nothing major either.

Training on “important” attributes based on the training the whole DS.

This might seem a bit redundant, but just as a check we tried training it using what the gini split thought was “Important” based on the training DS above.

# Rank attributes by the trained forest's variable importance
# (MeanDecreaseGini, the single column returned by importance()).
importanceRankAll <- data.frame(importance(forest.All))
# BUG FIX: order() was previously applied to the whole data.frame, which
# raises "cannot xtfrm data frames" (see the recorded warning) and does not
# give a reliable ranking. Order by the importance column itself.
importanceRankAllNames <- row.names(importanceRankAll)[order(importanceRankAll[[1]], decreasing=T)]
# Keep the top 20 attributes (1:21 previously selected 21 by mistake).
importanceRankAllNames.Top20 <- importanceRankAllNames[1:20]
linkTransformedBalanced.AllSelected <- linkTransformedBalanced[,c(importanceRankAllNames.Top20,"binnedY")]

# Forest trained on only the importance-selected attributes.
# NOTE(review): ntree=50/mtry=10 differ from the tuned 120/13 used above --
# presumably retuned for the reduced attribute set; verify.
forestAll2 <- randomForest(binnedY ~ ., data = linkTransformedBalanced.AllSelected, nodesize=1, ntree=50, mtry=10)
plot(forestAll2)

# Evaluate on the real (unbalanced) test set.
forestPredict.top20important <- predict(forestAll2, linkTransformedTest, type="class")
tempTable.top20important <- table(linkTransformedTest$binnedY, forestPredict.top20important)
tempTable.top20important
##          forestPredict.top20important
##           Spam NonSpam
##   NonSpam  380    1606
##   Spam      62      60
plotROC(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict.top20important == "Spam"))

auc_roc(as.integer(linkTransformedTest$binnedY == "Spam"), as.integer(forestPredict.top20important == "Spam"))
## [1] 0.5521285

As expected, we got a very similar score compared to training on the whole DS. Makes sense as it will converge at a point.

Training without balancing

We tried training the system without balancing the data set (thus a lot more non-spam than spam), and it didn’t work.

# Forest trained on the UNBALANCED training data (only V3 dropped).
linkTransformedTrain.noBalance <- subset(linkTransformedTrain, select=-c(V3)) 
forest.NoBalanceAll <- randomForest(binnedY ~ ., data = linkTransformedTrain.noBalance, nodesize=1, ntree=2000, mtry=13)
forestPredict.NoBalanceAll <- predict(forest.NoBalanceAll, linkTransformedTest, type="class")
tempTable.NoBalanceAll <- table(linkTransformedTest$binnedY, forestPredict.NoBalanceAll)
tempTable.NoBalanceAll
# The class imbalance makes the forest predict almost everything NonSpam:
##          forestPredict.NoBalanceAll
##           NonSpam Spam
##   NonSpam    1981    5
##   Spam        115    7

The input data to a Forest must be at least somewhat balanced for it to work well.

Adding PCA to reduce dimensions

As our attribute selection did not seem to help our models, we tried a different way to reduce the dimensions in hope of finding better clusters and patterns for our models to pickup using PCA.

As such, for our next test, we decided to PCA the whole DS and pass the new attributes to our models.

Due to time constraints, we were unable to selectively PCA specific columns instead of all the attributes at once. This will probably heavily affect our Naive Bayes model.

# Fit PCA on the full training attributes (label and raw score dropped),
# standardising each column since the attributes are on very different scales.
linkTransformedTrain.X <- subset(linkTransformedTrain,select=-c(binnedY,V3))
linkTransformedTrain.PCAParams <- prcomp(linkTransformedTrain.X, center = TRUE, scale = TRUE)
summary(linkTransformedTrain.PCAParams)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     6.5918 4.3415 3.06635 2.66024 2.43142 2.27430 2.25803
## Proportion of Variance 0.3149 0.1366 0.06813 0.05128 0.04284 0.03748 0.03695
## Cumulative Proportion  0.3149 0.4515 0.51958 0.57086 0.61370 0.65118 0.68813
##                            PC8     PC9    PC10    PC11    PC12   PC13    PC14
## Standard deviation     1.88302 1.86017 1.70913 1.63723 1.51857 1.4860 1.38040
## Proportion of Variance 0.02569 0.02507 0.02117 0.01942 0.01671 0.0160 0.01381
## Cumulative Proportion  0.71382 0.73890 0.76006 0.77949 0.79620 0.8122 0.82601
##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
## Standard deviation     1.36310 1.30581 1.26299 1.22541 1.13701 1.08186 1.02919
## Proportion of Variance 0.01346 0.01236 0.01156 0.01088 0.00937 0.00848 0.00768
## Cumulative Proportion  0.83947 0.85183 0.86339 0.87427 0.88364 0.89212 0.89979
##                           PC22    PC23    PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.98603 0.95545 0.93173 0.90426 0.87994 0.82567 0.81727
## Proportion of Variance 0.00705 0.00662 0.00629 0.00593 0.00561 0.00494 0.00484
## Cumulative Proportion  0.90684 0.91346 0.91975 0.92567 0.93128 0.93622 0.94106
##                           PC29    PC30    PC31    PC32   PC33    PC34    PC35
## Standard deviation     0.79319 0.75156 0.67924 0.65971 0.6439 0.63329 0.61288
## Proportion of Variance 0.00456 0.00409 0.00334 0.00315 0.0030 0.00291 0.00272
## Cumulative Proportion  0.94562 0.94971 0.95306 0.95621 0.9592 0.96212 0.96484
##                           PC36    PC37    PC38    PC39    PC40    PC41    PC42
## Standard deviation     0.59415 0.57943 0.56250 0.54235 0.53902 0.53239 0.50770
## Proportion of Variance 0.00256 0.00243 0.00229 0.00213 0.00211 0.00205 0.00187
## Cumulative Proportion  0.96740 0.96983 0.97213 0.97426 0.97636 0.97842 0.98029
##                           PC43    PC44   PC45    PC46    PC47    PC48    PC49
## Standard deviation     0.48273 0.46353 0.4556 0.44416 0.41394 0.38126 0.37694
## Proportion of Variance 0.00169 0.00156 0.0015 0.00143 0.00124 0.00105 0.00103
## Cumulative Proportion  0.98197 0.98353 0.9850 0.98647 0.98771 0.98876 0.98979
##                           PC50    PC51    PC52    PC53    PC54    PC55    PC56
## Standard deviation     0.35993 0.35001 0.34516 0.33563 0.31415 0.30306 0.29881
## Proportion of Variance 0.00094 0.00089 0.00086 0.00082 0.00072 0.00067 0.00065
## Cumulative Proportion  0.99073 0.99162 0.99248 0.99330 0.99401 0.99468 0.99532
##                           PC57    PC58    PC59    PC60    PC61    PC62    PC63
## Standard deviation     0.27676 0.26684 0.25105 0.24432 0.23228 0.22231 0.20528
## Proportion of Variance 0.00056 0.00052 0.00046 0.00043 0.00039 0.00036 0.00031
## Cumulative Proportion  0.99588 0.99639 0.99685 0.99728 0.99767 0.99803 0.99834
##                           PC64    PC65    PC66    PC67    PC68    PC69    PC70
## Standard deviation     0.19499 0.18493 0.17233 0.15592 0.12342 0.11258 0.11147
## Proportion of Variance 0.00028 0.00025 0.00022 0.00018 0.00011 0.00009 0.00009
## Cumulative Proportion  0.99861 0.99886 0.99908 0.99925 0.99936 0.99946 0.99955
##                           PC71    PC72    PC73    PC74    PC75    PC76    PC77
## Standard deviation     0.10584 0.09951 0.09206 0.07959 0.06897 0.06478 0.05348
## Proportion of Variance 0.00008 0.00007 0.00006 0.00005 0.00003 0.00003 0.00002
## Cumulative Proportion  0.99963 0.99970 0.99976 0.99981 0.99984 0.99987 0.99989
##                           PC78    PC79    PC80    PC81    PC82    PC83    PC84
## Standard deviation     0.05020 0.04994 0.04347 0.04008 0.03852 0.03527 0.03083
## Proportion of Variance 0.00002 0.00002 0.00001 0.00001 0.00001 0.00001 0.00001
## Cumulative Proportion  0.99991 0.99993 0.99994 0.99995 0.99996 0.99997 0.99998
##                           PC85    PC86   PC87    PC88    PC89    PC90    PC91
## Standard deviation     0.02876 0.02384 0.0203 0.01758 0.01492 0.01376 0.01023
## Proportion of Variance 0.00001 0.00000 0.0000 0.00000 0.00000 0.00000 0.00000
## Cumulative Proportion  0.99999 0.99999 1.0000 0.99999 1.00000 1.00000 1.00000
##                            PC92     PC93     PC94     PC95     PC96     PC97
## Standard deviation     0.008623 0.005907 0.005509 0.005442 0.004364 0.003708
## Proportion of Variance 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## Cumulative Proportion  1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
##                            PC98     PC99    PC100    PC101     PC102     PC103
## Standard deviation     0.003044 0.002405 0.002306 0.001149 5.371e-14 2.335e-14
## Proportion of Variance 0.000000 0.000000 0.000000 0.000000 0.000e+00 0.000e+00
## Cumulative Proportion  1.000000 1.000000 1.000000 1.000000 1.000e+00 1.000e+00
##                            PC104     PC105     PC106     PC107     PC108
## Standard deviation     1.592e-14 1.529e-14 1.282e-14 9.225e-15 9.009e-15
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
##                            PC109     PC110     PC111     PC112     PC113
## Standard deviation     7.932e-15 6.829e-15 6.678e-15 6.279e-15 5.686e-15
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
##                            PC114     PC115     PC116     PC117     PC118
## Standard deviation     5.294e-15 4.613e-15 4.159e-15 4.099e-15 3.918e-15
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
##                           PC119     PC120     PC121     PC122     PC123
## Standard deviation     3.88e-15 3.388e-15 3.068e-15 2.757e-15 2.258e-15
## Proportion of Variance 0.00e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.00e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
##                            PC124    PC125     PC126     PC127     PC128
## Standard deviation     2.154e-15 1.87e-15 1.803e-15 1.776e-15 1.558e-15
## Proportion of Variance 0.000e+00 0.00e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.00e+00 1.000e+00 1.000e+00 1.000e+00
##                            PC129     PC130     PC131     PC132     PC133
## Standard deviation     1.427e-15 1.406e-15 1.375e-15 1.327e-15 1.173e-15
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
##                            PC134     PC135     PC136     PC137     PC138
## Standard deviation     9.589e-16 8.874e-16 8.296e-16 6.298e-16 4.909e-16
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00

We decided to keep components up to PC13 to capture ~80% of the variance in our data

# Keep the class label plus the first 13 principal components (~80% variance).
linkTransformedTrain.PCA <- data.frame(linkTransformedTrain$binnedY ,linkTransformedTrain.PCAParams$x)[,c(1:14)]
names(linkTransformedTrain.PCA)[1] <- "binnedY"

# TRANSFORMING PCA TEST
linkTransformedTest.x <- subset(linkTransformedTest,select=-c(binnedY,V3))
# BUG FIX: the test set was previously standardised with its OWN column
# means/sds (scale() with default center/scale) before applying the training
# rotation, which projects it into a different space than the training data.
# predict.prcomp() centres and scales new data with the TRAINING parameters,
# which is the correct way to project it into the fitted PCA space.
linkTransformedTest.PCA <- predict(linkTransformedTrain.PCAParams, newdata = linkTransformedTest.x)
linkTransformedTest.PCA <- data.frame(linkTransformedTest$binnedY, linkTransformedTest.PCA)
names(linkTransformedTest.PCA)[1] <- "binnedY"

We can then proceed with the same methods we used, but instead we’ll be passing in all the columns

# Creating Balanced DS, shuffling
{
  # Oversample Spam by duplicating each Spam row once...
  linkTransformedTrainSpam.PCA <- subset(linkTransformedTrain.PCA, binnedY == "Spam")
  linkTransformedTrainSpam.PCA <- rbind(linkTransformedTrainSpam.PCA,linkTransformedTrainSpam.PCA)
  # ...then undersample NonSpam (without replacement) to the doubled Spam count.
  linkTransformedTrainNotSpam.PCA <- subset(linkTransformedTrain.PCA, binnedY == "NonSpam")
  linkTransformedTrainNotSpam.PCA <- linkTransformedTrainNotSpam.PCA[sample(1:nrow(linkTransformedTrainNotSpam.PCA), size=nrow(linkTransformedTrainSpam.PCA), replace=F),]
  
  # 50/50 Spam/NonSpam balanced frame.
  linkTransformedBalanced.PCA = rbind(linkTransformedTrainSpam.PCA, linkTransformedTrainNotSpam.PCA)
}

# Shuffle

# Shuffle the balanced PCA frame so the duplicated Spam rows are dispersed.
rows <- sample(nrow(linkTransformedBalanced.PCA))
linkTransformedBalanced.PCA <- linkTransformedBalanced.PCA[rows,]

# 70/30 train/test split.
# BUG FIX: the previous rep(0, 0.7*n)/rep(1, 0.3*n) construction silently
# produced a vector shorter than nrow() whenever n was not a multiple of 10,
# so the logical comparison recycled during subsetting and rows could be
# mis-assigned or counted twice. Index-based sampling covers every row
# exactly once.
n <- nrow(linkTransformedBalanced.PCA)
trainIdx <- sample(seq_len(n), size = floor(0.7 * n))

linkTransformedBalancedTrain.PCA <- linkTransformedBalanced.PCA[trainIdx, ]
linkTransformedBalancedTest.PCA <- linkTransformedBalanced.PCA[-trainIdx, ]

Training with MLP

library(RSNNS)

# Map a numeric spam score to a factor label: > 0.5 -> "Spam", else "NonSpam".
# (Re-declaration of the helper defined earlier in the notebook; intended to
# be applied element-wise, e.g. via sapply() over a numeric column.)
binningSpam <- function(value){
  label <- if (value > 0.5) "Spam" else "NonSpam"
  factor(label)
}

# Inputs = the 13 PCA columns; targets = one-hot encoded class labels.
trainValues <- linkTransformedBalanced.PCA
trainValues$binnedY = NULL
trainTargets <- decodeClassLabels(linkTransformedBalanced.PCA[,"binnedY"])

# 80/20 split, then normalise inputs (RSNNS stores the norm parameters in
# the trainset object).
trainset <- splitForTrainingAndTest(trainValues, trainTargets, ratio=0.2)
trainset <- normTrainingAndTestSet(trainset)

# Single hidden layer of 20 units, very small learning rate, 4000 iterations.
model <- mlp(trainset$inputsTrain, trainset$targetsTrain, size=c(20), learnFuncParams=c(0.001), maxit=4000, inputsTest=trainset$inputsTest, targetsTest=trainset$targetsTest)
predictTestSet <- predict(model,trainset$inputsTest)

# Confusion matrices on train then test (columns 1/2 = the two classes).
confusionMatrix(trainset$targetsTrain,fitted.values(model))
##        predictions
## targets   1   2
##       1 267  86
##       2  87 270
confusionMatrix(trainset$targetsTest,predictTestSet)
##        predictions
## targets  1  2
##       1 60 31
##       2 27 60
# Diagnostic plots: iterative error, regression error, train/test ROC.
par(mar=c(5.1,4.1,4.1,2.1))
par(mfrow=c(2,2))
plotIterativeError(model)
plotRegressionError(predictTestSet[,2], trainset$targetsTest[,2])
plotROC(fitted.values(model)[,2], trainset$targetsTrain[,2])
plotROC(predictTestSet[,2], trainset$targetsTest[,2])

# Evaluate the MLP on the real (unbalanced) test set; columns 2:14 are the
# 13 PC scores (column 1 is binnedY).
# NOTE(review): the model was trained on inputs normalised by
# normTrainingAndTestSet(), but raw PCA scores are passed here without
# applying the stored normalisation parameters -- this likely depresses the
# reported AUC. Verify (see RSNNS normalizeData/getNormParameters).
predictTestSet <- predict(model,linkTransformedTest.PCA[2:14]) # IF ADJUSTING LATER
confusionMatrix(linkTransformedTest.PCA$binnedY,predictTestSet)
##        predictions
## targets    1    2
##       1 1037  949
##       2   27   95
plotROC(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), predictTestSet[,2])
auc_roc(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), predictTestSet[,2])
## [1] 0.5235126

MLP now works! But our results aren’t exactly fantastic, with an AUC of 52.3%

Training with DT

# Decision tree (Gini split, depth <= 7) on the PCA-transformed balanced
# training split.
Gini.DT.rpart.PCA <- rpart(binnedY ~ ., data = linkTransformedBalancedTrain.PCA, parms=list(split = "gini"), control =list(maxdepth = 7))
# Evaluate on the balanced 30% split.
GiniDTPredict.PCA <- predict(Gini.DT.rpart.PCA, linkTransformedBalancedTest.PCA, type="class")
table(linkTransformedBalancedTest.PCA$binnedY, GiniDTPredict.PCA)
##          GiniDTPredict.PCA
##           NonSpam Spam
##   NonSpam      89   34
##   Spam         52   91
plotROC(as.integer(linkTransformedBalancedTest.PCA$binnedY == "Spam"), as.integer(GiniDTPredict.PCA == "Spam"))

auc_roc(as.integer(linkTransformedBalancedTest.PCA$binnedY == "Spam"), as.integer(GiniDTPredict.PCA == "Spam"))
## [1] 0.6796028
# Evaluate on the real (unbalanced) test set.
GiniDTPredict.Final.PCA <- predict(Gini.DT.rpart.PCA, linkTransformedTest.PCA, type="class")
table(linkTransformedTest.PCA$binnedY, GiniDTPredict.Final.PCA)
##          GiniDTPredict.Final.PCA
##           NonSpam Spam
##   NonSpam    1470  516
##   Spam         51   71
plotROC(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), as.integer(GiniDTPredict.Final.PCA == "Spam"))

auc_roc(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), as.integer(GiniDTPredict.Final.PCA == "Spam"))
## [1] 0.5437117
# BUG FIX: this previously printed Gini.DT.rpart -- the earlier NON-PCA tree
# (its recorded splits used raw attribute names such as
# log_OP_outdegree_mp_div_pagerank_mp_CP_, not principal components).
# Print the PCA tree actually trained in this chunk instead; the stale
# non-PCA tree output has been removed.
print(Gini.DT.rpart.PCA)

Unfortunately, even after optimization (we looped through different depths and types), we are getting 54.3% on the final DS.

Training with Forest

# Random forest on the full balanced PCA data (mtry=5 since only 13 PCs).
# Note this reuses/overwrites the trainDS.all.all and forest.All.All names
# from the non-PCA section above.
trainDS.all.all <- linkTransformedBalanced.PCA

forest.All.All <- randomForest(binnedY ~ ., data = trainDS.all.all, nodesize=1, ntree=120, mtry=5)

plot(forest.All.All)

# Evaluate on the real (unbalanced) PCA-projected test set.
forestPredict.All.All.Testing <- predict(forest.All.All, linkTransformedTest.PCA, type="class")
table(linkTransformedTest.PCA$binnedY, forestPredict.All.All.Testing)
##          forestPredict.All.All.Testing
##           NonSpam Spam
##   NonSpam    1535  451
##   Spam         56   66
plotROC(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), as.integer(forestPredict.All.All.Testing == "Spam"))

auc_roc(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), as.integer(forestPredict.All.All.Testing == "Spam"))
## [1] 0.5462308

54.6% AUC

Training with Naive Bayes

# Naive Bayes on the PCA-transformed balanced training split.
# (PCA components are linear mixes of all attributes, so the independence
# assumption is expected to suffer -- see the discussion above.)
naiveBayesModel.PCA <- naiveBayes(binnedY ~ ., data=linkTransformedBalancedTrain.PCA)

# Evaluate on the balanced 30% split.
naiveBayesPredict.PCA <-  predict(naiveBayesModel.PCA, linkTransformedBalancedTest.PCA, type="class")
table(linkTransformedBalancedTest.PCA$binnedY, naiveBayesPredict.PCA)
##          naiveBayesPredict.PCA
##           NonSpam Spam
##   NonSpam     106   17
##   Spam         91   52
plotROC(as.integer(linkTransformedBalancedTest.PCA$binnedY == "Spam"), as.integer(naiveBayesPredict.PCA == "Spam"))

auc_roc(as.integer(linkTransformedBalancedTest.PCA$binnedY == "Spam"), as.integer(naiveBayesPredict.PCA == "Spam"))
## [1] 0.6458471
# Evaluate on the real (unbalanced) PCA-projected test set.
naiveBayesPredict.Testing.PCA <-  predict(naiveBayesModel.PCA, linkTransformedTest.PCA, type="class")
table(linkTransformedTest.PCA$binnedY, naiveBayesPredict.Testing.PCA)
##          naiveBayesPredict.Testing.PCA
##           NonSpam Spam
##   NonSpam    1638  348
##   Spam         76   46
plotROC(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), as.integer(naiveBayesPredict.Testing.PCA == "Spam"))

auc_roc(as.integer(linkTransformedTest.PCA$binnedY == "Spam"), as.integer(naiveBayesPredict.Testing.PCA == "Spam"))
## [1] 0.5362053

As expected, It performed worse at 53.6% AUC